########################################################
## PROGRAM NAME: 009_sc.R                             ##
## AUTHOR: MATT MLECZKO                               ##
## DATE CREATED: 05/05/2021                           ##
## INPUTS:                                            ##
##    002_af_wide_final.Rda                           ##
##                                                    ##
## OUTPUTS:                                           ##
##    009_zip_all_sc.Rda                              ##
##                                                    ## 
## PURPOSE: Social capital analyses                   ##
##          create Figures 6 and 7                    ##
##          and Table 5                               ##
##                                                    ##
## LIST OF UPDATES:                                   ##
##                                                    ##
########################################################

log <- file("009_sc.txt") 
sink(log, append=TRUE)
sink(log, append=TRUE, type="message")

set.seed(08540)

## load libraries ##

library(tidycensus)
library(tidyverse)
library(foreign)
library(stringr)
library(tm)
library(gdata)
library(gsubfn)
library(readxl)
library(stargazer)
library(ggplot2)
library(haven)
library(webr)

## define paths ##
data_path <- "PATH TO DATA HERE"
progs <- "PATH TO PROGRAMS HERE"

## set working directory ##
setwd(data_path)

##
## load necessary functions ## 
##

`%notin%` <- Negate(`%in%`)
source(paste0(progs,"stata_merge.R"))
source(paste0(progs,"hud_group_zip.R"))
source(paste0(progs,"rd_data_zip.R"))

## load the analytic file ##
load("002_af_wide_final.Rda")
load("001_2020_Census_del.Rda")

census_api_key("CENSUS API KEY HERE")

## MSA-FIPS crosswalk ##
fips.to.msas <- census.del.2020.final %>%
  filter(`Metropolitan/Micropolitan Statistical Area` == "Metropolitan Statistical Area") %>%
  select(FIPS,
        `CBSA Code`) %>%
  rename(CBSA = `CBSA Code`)

##
## step 1: aggregate 2020 tract level data to zip level ##
##

load("002_trts_fsc.Rda")

## initial processing ##
all.tracts.2020.fzip1 <- all.tracts.2020.cbpf %>% 
  group_by(GEOID) %>%
  summarize_at(vars(pop_total:nhwh_hhld_inc16),
               funs(sum(., na.rm=T))) %>%
  mutate(hhs_inc = rowSums(cbind(aian_hhld_inc,as_hhld_inc,bl_hhld_inc,nhpi_hhld_inc,
                                 hl_hhld_inc,oth_hhld_inc,tw_hhld_inc,nhwh_hhld_inc),na.rm=T),
         FIPS = substr(GEOID,1,5))

all.tracts.2020.fzip2 <- all.tracts.2020.cbpf %>% 
  group_by(GEOID) %>%
  mutate(pop_wt = pop_total/sum(pop_total,na.rm=T)) %>%
  summarize_at(vars(median_hhld_inc),
               funs(weighted.mean(., pop_wt, na.rm=T)))

all.tracts.2020.fzip.m <- stata.merge(all.tracts.2020.fzip1,
                                      all.tracts.2020.fzip2, 
                                      "GEOID")

## check merge ## 
paste("MERGE BETWEEN ZIP FILES A AND B")
table(all.tracts.2020.fzip.m$merge.variable, useNA = "ifany")

## keep matches ##
all.tracts.2020.fzip <- all.tracts.2020.fzip.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable)

paste("OBS IN ZIP FILE A")
nrow(all.tracts.2020.fzip1)
paste("OBS IN ZIP FILE B")
nrow(all.tracts.2020.fzip2)
paste("OBS IN MERGED ZIP FILE")
nrow(all.tracts.2020.fzip)
paste("SUMMARY OF MERGED ZIP FILE")
summary(all.tracts.2020.fzip)
paste("MISSINGS IN MERGED ZIP FILE")
sapply(all.tracts.2020.fzip, function(x) sum(is.na(x)))

paste("SUMMARY OF MHI IN 2020 TRACT FILE")
summary(all.tracts.2020.cbpf$median_hhld_inc)
paste("SUMMARY OF MHI IN 2020 ZIP FILE")
summary(all.tracts.2020.fzip$median_hhld_inc)

## 
## aggregate tracts to zip code
##

# read-in tract zip xwalk ##
load("001_ziptrt_xwalk.Rda")

## pre-merge checks ##
paste("CLASS OF GEOID IN TRACT FILE")
class(all.tracts.2020.fzip$GEOID)
paste("GEOID CHECK IN TRACT FILE")
range(nchar(trim(all.tracts.2020.fzip$GEOID)))
paste("IS TRACT FILE UNIQUE BY GEOID?")
length(unique(all.tracts.2020.fzip$GEOID)) == nrow(all.tracts.2020.fzip)

paste("CLASS OF GEOID IN XWALK")
class(ziptrt.xwalk.in$GEOID)
paste("GEOID CHECK IN XWALK")
range(nchar(trim(ziptrt.xwalk.in$GEOID)))
paste("IS XWALK UNIQUE BY GEOID?")
length(unique(ziptrt.xwalk.in$GEOID)) == nrow(ziptrt.xwalk.in)

## merge files ##
trt.zip.2020.m <- stata.merge(all.tracts.2020.fzip,
                              ziptrt.xwalk.in,
                              "GEOID")

## check merge ##
paste("MERGE BETWEEN TRACT FILE AND ZIPTRACT XWALK")
table(trt.zip.2020.m$merge.variable, useNA = "ifany")
prop.table(table(trt.zip.2020.m$merge.variable, useNA = "ifany"))

##
## process race/ethnicity data ##
##

zip.2020.er <- trt.zip.2020.m %>%
  filter(merge.variable == 3) %>%
  select(-GEOID,
         -merge.variable) %>%
  select(ZIP,
         TRACT,
         USPS_ZIP_PREF_CITY,
         USPS_ZIP_PREF_STATE,
         RES_RATIO,
         BUS_RATIO,
         OTH_RATIO,
         TOT_RATIO,
         FIPS,
         pop_total,
         pop_nh_aian,
         pop_nh_asian,
         pop_nh_black,
         pop_hl,
         pop_nh_multi,
         pop_nh_nhpi,
         pop_nh_other,
         pop_nh_white) %>%
  mutate_at(vars(pop_total:pop_nh_white), ~ . *RES_RATIO) %>%
  group_by(ZIP) %>%
  summarize_if(is.numeric, sum, na.rm=T) %>%
  filter(pop_total >= 500) %>%
  mutate(per_aian = pop_nh_aian/pop_total,
         per_asian = pop_nh_asian/pop_total,
         per_black = pop_nh_black/pop_total,
         per_multi = pop_nh_multi/pop_total,
         per_nhpi = pop_nh_nhpi/pop_total,
         per_hl = pop_hl/pop_total,
         per_other = rowSums(cbind(pop_nh_other, pop_nh_nhpi, pop_nh_multi),na.rm=T)/pop_total,
         per_white = pop_nh_white/pop_total,
         log_asian = case_when(pop_nh_asian != 0 ~ log(1/per_asian),
                               pop_nh_asian == 0 ~ 0),
         log_black = case_when(pop_nh_black != 0 ~ log(1/per_black),
                               pop_nh_black == 0 ~ 0),
         log_hl = case_when(pop_hl != 0 ~ log(1/per_hl),
                                pop_hl == 0 ~ 0),
         log_white = case_when(pop_nh_white != 0 ~ log(1/per_white),
                               pop_nh_white == 0 ~ 0),
         log_aian = case_when(pop_nh_aian != 0 ~ log(1/per_aian),
                              pop_nh_aian == 0 ~ 0),
         log_other = case_when(rowSums(cbind(pop_nh_other, pop_nh_nhpi, pop_nh_multi),na.rm=T) != 0 ~ log(1/per_other),
                               rowSums(cbind(pop_nh_other, pop_nh_nhpi, pop_nh_multi),na.rm=T) == 0 ~ 0),
         trt_entropy_er = per_asian*log_asian + 
                          per_black*log_black + 
                          per_hl*log_hl + 
                          per_white*log_white +
                          per_aian*log_aian +
                          per_other*log_other) %>%
  rename(GEOID = ZIP)

paste("MISSINGS IN ZIP ER FILE")
sapply(zip.2020.er, function(x) sum(is.na(x)))
paste("SUMMARY OF ZIP ER ENTROPY")
summary(zip.2020.er$trt_entropy_er)

##
## step 2: get 2018-2022 zip code data for SES vars ##
##

## get zip code data ## 
v2022 <- load_variables(2022, "acs5")

zip.2022.in <- get_acs(geography = "zcta",
                       variables = c(pop_total = "B01003_001",
                                     hisp_race_den = "B03002_001",
                                     pop_nh = "B03002_002",
                                     pop_nh_white = "B03002_003",
                                     pop_nh_black = "B03002_004",
                                     pop_nh_aian = "B03002_005",
                                     pop_nh_asian = "B03002_006",
                                     pop_nh_nhpi = "B03002_007",
                                     pop_nh_other = "B03002_008",
                                     pop_nh_multi = "B03002_009",
                                     pop_h = "B03002_012",
                                     pop_h_white = "B03002_013",
                                     pop_h_black = "B03002_014",
                                     pop_h_aian = "B03002_015",
                                     pop_h_asian = "B03002_016",
                                     pop_h_nhpi = "B03002_017",
                                     pop_h_other = "B03002_018",
                                     pop_h_multi = "B03002_019",
                                     hhld_pov_num = "B17017_002",
                                     hhld_pov_den = "B17017_001",
                                     median_hhld_inc = "B19013_001",
                                     hhld_inc = "B19001_001",
                                     hhld_inc1 = "B19001_002",
                                     hhld_inc2 = "B19001_003",
                                     hhld_inc3 = "B19001_004",
                                     hhld_inc4 = "B19001_005",
                                     hhld_inc5 = "B19001_006",
                                     hhld_inc6 = "B19001_007",
                                     hhld_inc7 = "B19001_008",
                                     hhld_inc8 = "B19001_009",
                                     hhld_inc9 = "B19001_010",
                                     hhld_inc10 = "B19001_011",
                                     hhld_inc11 = "B19001_012",
                                     hhld_inc12 = "B19001_013",
                                     hhld_inc13 = "B19001_014",
                                     hhld_inc14 = "B19001_015",
                                     hhld_inc15 = "B19001_016",
                                     hhld_inc16 = "B19001_017",
                                     wh_hhld_inc = "B19001A_001",
                                     wh_hhld_inc1 = "B19001A_002",
                                     wh_hhld_inc2 = "B19001A_003",
                                     wh_hhld_inc3 = "B19001A_004",
                                     wh_hhld_inc4 = "B19001A_005",
                                     wh_hhld_inc5 = "B19001A_006",
                                     wh_hhld_inc6 = "B19001A_007",
                                     wh_hhld_inc7 = "B19001A_008",
                                     wh_hhld_inc8 = "B19001A_009",
                                     wh_hhld_inc9 = "B19001A_010",
                                     wh_hhld_inc10 = "B19001A_011",
                                     wh_hhld_inc11 = "B19001A_012",
                                     wh_hhld_inc12 = "B19001A_013",
                                     wh_hhld_inc13 = "B19001A_014",
                                     wh_hhld_inc14 = "B19001A_015",
                                     wh_hhld_inc15 = "B19001A_016",
                                     wh_hhld_inc16 = "B19001A_017",
                                     bl_hhld_inc = "B19001B_001",
                                     bl_hhld_inc1 = "B19001B_002",
                                     bl_hhld_inc2 = "B19001B_003",
                                     bl_hhld_inc3 = "B19001B_004",
                                     bl_hhld_inc4 = "B19001B_005",
                                     bl_hhld_inc5 = "B19001B_006",
                                     bl_hhld_inc6 = "B19001B_007",
                                     bl_hhld_inc7 = "B19001B_008",
                                     bl_hhld_inc8 = "B19001B_009",
                                     bl_hhld_inc9 = "B19001B_010",
                                     bl_hhld_inc10 = "B19001B_011",
                                     bl_hhld_inc11 = "B19001B_012",
                                     bl_hhld_inc12 = "B19001B_013",
                                     bl_hhld_inc13 = "B19001B_014",
                                     bl_hhld_inc14 = "B19001B_015",
                                     bl_hhld_inc15 = "B19001B_016",
                                     bl_hhld_inc16 = "B19001B_017",
                                     aian_hhld_inc = "B19001C_001",
                                     aian_hhld_inc1 = "B19001C_002",
                                     aian_hhld_inc2 = "B19001C_003",
                                     aian_hhld_inc3 = "B19001C_004",
                                     aian_hhld_inc4 = "B19001C_005",
                                     aian_hhld_inc5 = "B19001C_006",
                                     aian_hhld_inc6 = "B19001C_007",
                                     aian_hhld_inc7 = "B19001C_008",
                                     aian_hhld_inc8 = "B19001C_009",
                                     aian_hhld_inc9 = "B19001C_010",
                                     aian_hhld_inc10 = "B19001C_011",
                                     aian_hhld_inc11 = "B19001C_012",
                                     aian_hhld_inc12 = "B19001C_013",
                                     aian_hhld_inc13 = "B19001C_014",
                                     aian_hhld_inc14 = "B19001C_015",
                                     aian_hhld_inc15 = "B19001C_016",
                                     aian_hhld_inc16 = "B19001C_017",
                                     as_hhld_inc = "B19001D_001",
                                     as_hhld_inc1 = "B19001D_002",
                                     as_hhld_inc2 = "B19001D_003",
                                     as_hhld_inc3 = "B19001D_004",
                                     as_hhld_inc4 = "B19001D_005",
                                     as_hhld_inc5 = "B19001D_006",
                                     as_hhld_inc6 = "B19001D_007",
                                     as_hhld_inc7 = "B19001D_008",
                                     as_hhld_inc8 = "B19001D_009",
                                     as_hhld_inc9 = "B19001D_010",
                                     as_hhld_inc10 = "B19001D_011",
                                     as_hhld_inc11 = "B19001D_012",
                                     as_hhld_inc12 = "B19001D_013",
                                     as_hhld_inc13 = "B19001D_014",
                                     as_hhld_inc14 = "B19001D_015",
                                     as_hhld_inc15 = "B19001D_016",
                                     as_hhld_inc16 = "B19001D_017",
                                     nhpi_hhld_inc = "B19001E_001",
                                     nhpi_hhld_inc1 = "B19001E_002",
                                     nhpi_hhld_inc2 = "B19001E_003",
                                     nhpi_hhld_inc3 = "B19001E_004",
                                     nhpi_hhld_inc4 = "B19001E_005",
                                     nhpi_hhld_inc5 = "B19001E_006",
                                     nhpi_hhld_inc6 = "B19001E_007",
                                     nhpi_hhld_inc7 = "B19001E_008",
                                     nhpi_hhld_inc8 = "B19001E_009",
                                     nhpi_hhld_inc9 = "B19001E_010",
                                     nhpi_hhld_inc10 = "B19001E_011",
                                     nhpi_hhld_inc11 = "B19001E_012",
                                     nhpi_hhld_inc12 = "B19001E_013",
                                     nhpi_hhld_inc13 = "B19001E_014",
                                     nhpi_hhld_inc14 = "B19001E_015",
                                     nhpi_hhld_inc15 = "B19001E_016",
                                     nhpi_hhld_inc16 = "B19001E_017",
                                     oth_hhld_inc = "B19001F_001",
                                     oth_hhld_inc1 = "B19001F_002",
                                     oth_hhld_inc2 = "B19001F_003",
                                     oth_hhld_inc3 = "B19001F_004",
                                     oth_hhld_inc4 = "B19001F_005",
                                     oth_hhld_inc5 = "B19001F_006",
                                     oth_hhld_inc6 = "B19001F_007",
                                     oth_hhld_inc7 = "B19001F_008",
                                     oth_hhld_inc8 = "B19001F_009",
                                     oth_hhld_inc9 = "B19001F_010",
                                     oth_hhld_inc10 = "B19001F_011",
                                     oth_hhld_inc11 = "B19001F_012",
                                     oth_hhld_inc12 = "B19001F_013",
                                     oth_hhld_inc13 = "B19001F_014",
                                     oth_hhld_inc14 = "B19001F_015",
                                     oth_hhld_inc15 = "B19001F_016",
                                     oth_hhld_inc16 = "B19001F_017",
                                     tw_hhld_inc = "B19001G_001",
                                     tw_hhld_inc1 = "B19001G_002",
                                     tw_hhld_inc2 = "B19001G_003",
                                     tw_hhld_inc3 = "B19001G_004",
                                     tw_hhld_inc4 = "B19001G_005",
                                     tw_hhld_inc5 = "B19001G_006",
                                     tw_hhld_inc6 = "B19001G_007",
                                     tw_hhld_inc7 = "B19001G_008",
                                     tw_hhld_inc8 = "B19001G_009",
                                     tw_hhld_inc9 = "B19001G_010",
                                     tw_hhld_inc10 = "B19001G_011",
                                     tw_hhld_inc11 = "B19001G_012",
                                     tw_hhld_inc12 = "B19001G_013",
                                     tw_hhld_inc13 = "B19001G_014",
                                     tw_hhld_inc14 = "B19001G_015",
                                     tw_hhld_inc15 = "B19001G_016",
                                     tw_hhld_inc16 = "B19001G_017",
                                     nhwh_hhld_inc = "B19001H_001",
                                     nhwh_hhld_inc1 = "B19001H_002",
                                     nhwh_hhld_inc2 = "B19001H_003",
                                     nhwh_hhld_inc3 = "B19001H_004",
                                     nhwh_hhld_inc4 = "B19001H_005",
                                     nhwh_hhld_inc5 = "B19001H_006",
                                     nhwh_hhld_inc6 = "B19001H_007",
                                     nhwh_hhld_inc7 = "B19001H_008",
                                     nhwh_hhld_inc8 = "B19001H_009",
                                     nhwh_hhld_inc9 = "B19001H_010",
                                     nhwh_hhld_inc10 = "B19001H_011",
                                     nhwh_hhld_inc11 = "B19001H_012",
                                     nhwh_hhld_inc12 = "B19001H_013",
                                     nhwh_hhld_inc13 = "B19001H_014",
                                     nhwh_hhld_inc14 = "B19001H_015",
                                     nhwh_hhld_inc15 = "B19001H_016",
                                     nhwh_hhld_inc16 = "B19001H_017",
                                     hl_hhld_inc = "B19001I_001",
                                     hl_hhld_inc1 = "B19001I_002",
                                     hl_hhld_inc2 = "B19001I_003",
                                     hl_hhld_inc3 = "B19001I_004",
                                     hl_hhld_inc4 = "B19001I_005",
                                     hl_hhld_inc5 = "B19001I_006",
                                     hl_hhld_inc6 = "B19001I_007",
                                     hl_hhld_inc7 = "B19001I_008",
                                     hl_hhld_inc8 = "B19001I_009",
                                     hl_hhld_inc9 = "B19001I_010",
                                     hl_hhld_inc10 = "B19001I_011",
                                     hl_hhld_inc11 = "B19001I_012",
                                     hl_hhld_inc12 = "B19001I_013",
                                     hl_hhld_inc13 = "B19001I_014",
                                     hl_hhld_inc14 = "B19001I_015",
                                     hl_hhld_inc15 = "B19001I_016",
                                     hl_hhld_inc16 = "B19001I_017"),
                       output = "wide",
                       year = 2022)

## process ## 
zip.2022.ses <- zip.2022.in %>% 
  select(GEOID,
         ends_with("E")) %>%
  rename(ZIP = GEOID) %>%
  rename_at(.vars = vars(ends_with("E")),
            .funs = funs(sub("E", "", .)))

##
## merge zip codes to msas ##
##

msas <- census.del.2020.final %>%
  filter(`Metropolitan/Micropolitan Statistical Area` == "Metropolitan Statistical Area") %>%
  select(`CBSA Code`) %>%
  distinct() %>%
  rename(CBSA = `CBSA Code`)

## restrict zip to msa xwalk to those in metro areas ##
load("001_zipmsa_xwalk.Rda")

zip.msa.xwalk.rd.m <- stata.merge(zip.msa.xwalk,
                                  msas,
                                  "CBSA")

## check merge ##
table(zip.msa.xwalk.rd.m$merge.variable, useNA = "ifany")

## keep matches ## 
zip.msa.xwalk.rd <- zip.msa.xwalk.rd.m %>%
  filter(merge.variable == 3)

# now, merge on CBSA codes to ZIP SES file ## 
zip.msa.2022.m <- stata.merge(zip.2022.ses,
                              zip.msa.xwalk.rd,
                              "ZIP")

## check merge ##
table(zip.msa.2022.m$merge.variable, useNA = "ifany")

## process ##
zip.2022.ses.pr <- zip.msa.2022.m %>%
  filter(merge.variable == 3) %>%
  select(-pop20) %>%
  mutate(afact_num = as.numeric(afact),
         median_hhld_inc_char = as.character(median_hhld_inc)) %>%
  select(ZIP,
         CBSA,
         CBSAName20,
         median_hhld_inc_char,
         afact_num,
         starts_with("pop"),
         starts_with("hhld_inc"),
         starts_with("aian_hhld"),
         starts_with("as_hhld"),
         starts_with("bl_hhld"),
         starts_with("hl_hhld"),
         starts_with("nhpi_hhld"),
         starts_with("nhwh_hhld"),
         starts_with("oth_hhld"),
         starts_with("tw_hhld")) %>%
  mutate_at(vars(pop_total:tw_hhld_inc16), ~ . *afact_num) %>%
  mutate(hhs_inc = rowSums(cbind(aian_hhld_inc,
                                 as_hhld_inc,
                                 bl_hhld_inc,
                                 nhpi_hhld_inc,
                                 hl_hhld_inc,
                                 oth_hhld_inc,
                                 tw_hhld_inc,
                                 nhwh_hhld_inc),na.rm=T)) %>%
  filter(hhs_inc > 0) %>%
  mutate(phi1 = hhld_inc1/hhld_inc,
         phi2 = hhld_inc2/hhld_inc,
         phi3 = hhld_inc3/hhld_inc,
         phi4 = hhld_inc4/hhld_inc,
         phi5 = hhld_inc5/hhld_inc,
         phi6 = hhld_inc6/hhld_inc,
         phi7 = hhld_inc7/hhld_inc,
         phi8 = hhld_inc8/hhld_inc,
         phi9 = hhld_inc9/hhld_inc,
         phi10 = hhld_inc10/hhld_inc,
         phi11 = hhld_inc11/hhld_inc,
         phi12 = hhld_inc12/hhld_inc,
         phi13 = hhld_inc13/hhld_inc,
         phi14 = hhld_inc14/hhld_inc,
         phi15 = hhld_inc15/hhld_inc,
         phi16 = hhld_inc16/hhld_inc,
         as_phi1 = as_hhld_inc1/hhs_inc,
         as_phi2 = as_hhld_inc2/hhs_inc,
         as_phi3 = as_hhld_inc3/hhs_inc,
         as_phi4 = as_hhld_inc4/hhs_inc,
         as_phi5 = as_hhld_inc5/hhs_inc,
         as_phi6 = as_hhld_inc6/hhs_inc,
         as_phi7 = as_hhld_inc7/hhs_inc,
         as_phi8 = as_hhld_inc8/hhs_inc,
         as_phi9 = as_hhld_inc9/hhs_inc,
         as_phi10 = as_hhld_inc10/hhs_inc,
         as_phi11 = as_hhld_inc11/hhs_inc,
         as_phi12 = as_hhld_inc12/hhs_inc,
         as_phi13 = as_hhld_inc13/hhs_inc,
         as_phi14 = as_hhld_inc14/hhs_inc,
         as_phi15 = as_hhld_inc15/hhs_inc,
         as_phi16 = as_hhld_inc16/hhs_inc,
         bl_phi1 = bl_hhld_inc1/hhs_inc,
         bl_phi2 = bl_hhld_inc2/hhs_inc,
         bl_phi3 = bl_hhld_inc3/hhs_inc,
         bl_phi4 = bl_hhld_inc4/hhs_inc,
         bl_phi5 = bl_hhld_inc5/hhs_inc,
         bl_phi6 = bl_hhld_inc6/hhs_inc,
         bl_phi7 = bl_hhld_inc7/hhs_inc,
         bl_phi8 = bl_hhld_inc8/hhs_inc,
         bl_phi9 = bl_hhld_inc9/hhs_inc,
         bl_phi10 = bl_hhld_inc10/hhs_inc,
         bl_phi11 = bl_hhld_inc11/hhs_inc,
         bl_phi12 = bl_hhld_inc12/hhs_inc,
         bl_phi13 = bl_hhld_inc13/hhs_inc,
         bl_phi14 = bl_hhld_inc14/hhs_inc,
         bl_phi15 = bl_hhld_inc15/hhs_inc,
         bl_phi16 = bl_hhld_inc16/hhs_inc,
         hl_phi1 = hl_hhld_inc1/hhs_inc,
         hl_phi2 = hl_hhld_inc2/hhs_inc,
         hl_phi3 = hl_hhld_inc3/hhs_inc,
         hl_phi4 = hl_hhld_inc4/hhs_inc,
         hl_phi5 = hl_hhld_inc5/hhs_inc,
         hl_phi6 = hl_hhld_inc6/hhs_inc,
         hl_phi7 = hl_hhld_inc7/hhs_inc,
         hl_phi8 = hl_hhld_inc8/hhs_inc,
         hl_phi9 = hl_hhld_inc9/hhs_inc,
         hl_phi10 = hl_hhld_inc10/hhs_inc,
         hl_phi11 = hl_hhld_inc11/hhs_inc,
         hl_phi12 = hl_hhld_inc12/hhs_inc,
         hl_phi13 = hl_hhld_inc13/hhs_inc,
         hl_phi14 = hl_hhld_inc14/hhs_inc,
         hl_phi15 = hl_hhld_inc15/hhs_inc,
         hl_phi16 = hl_hhld_inc16/hhs_inc,
         nhwh_phi1 = nhwh_hhld_inc1/hhs_inc,
         nhwh_phi2 = nhwh_hhld_inc2/hhs_inc,
         nhwh_phi3 = nhwh_hhld_inc3/hhs_inc,
         nhwh_phi4 = nhwh_hhld_inc4/hhs_inc,
         nhwh_phi5 = nhwh_hhld_inc5/hhs_inc,
         nhwh_phi6 = nhwh_hhld_inc6/hhs_inc,
         nhwh_phi7 = nhwh_hhld_inc7/hhs_inc,
         nhwh_phi8 = nhwh_hhld_inc8/hhs_inc,
         nhwh_phi9 = nhwh_hhld_inc9/hhs_inc,
         nhwh_phi10 = nhwh_hhld_inc10/hhs_inc,
         nhwh_phi11 = nhwh_hhld_inc11/hhs_inc,
         nhwh_phi12 = nhwh_hhld_inc12/hhs_inc,
         nhwh_phi13 = nhwh_hhld_inc13/hhs_inc,
         nhwh_phi14 = nhwh_hhld_inc14/hhs_inc,
         nhwh_phi15 = nhwh_hhld_inc15/hhs_inc,
         nhwh_phi16 = nhwh_hhld_inc16/hhs_inc,
         oth_phi1 = rowSums(cbind(oth_hhld_inc1 , nhpi_hhld_inc1 , aian_hhld_inc1 , tw_hhld_inc1),na.rm=T)/hhs_inc,
         oth_phi2 = rowSums(cbind(oth_hhld_inc2 , nhpi_hhld_inc2 , aian_hhld_inc2 , tw_hhld_inc2),na.rm=T)/hhs_inc,
         oth_phi3 = rowSums(cbind(oth_hhld_inc3 , nhpi_hhld_inc3 , aian_hhld_inc3 , tw_hhld_inc3),na.rm=T)/hhs_inc,
         oth_phi4 = rowSums(cbind(oth_hhld_inc4 , nhpi_hhld_inc4 , aian_hhld_inc4 , tw_hhld_inc4),na.rm=T)/hhs_inc,
         oth_phi5 = rowSums(cbind(oth_hhld_inc5 , nhpi_hhld_inc5 , aian_hhld_inc5 , tw_hhld_inc5),na.rm=T)/hhs_inc,
         oth_phi6 = rowSums(cbind(oth_hhld_inc6 , nhpi_hhld_inc6 , aian_hhld_inc6 , tw_hhld_inc6),na.rm=T)/hhs_inc,
         oth_phi7 = rowSums(cbind(oth_hhld_inc7 , nhpi_hhld_inc7 , aian_hhld_inc7 , tw_hhld_inc7),na.rm=T)/hhs_inc,
         oth_phi8 = rowSums(cbind(oth_hhld_inc8 , nhpi_hhld_inc8 , aian_hhld_inc8 , tw_hhld_inc8),na.rm=T)/hhs_inc,
         oth_phi9 = rowSums(cbind(oth_hhld_inc9 , nhpi_hhld_inc9 , aian_hhld_inc9 , tw_hhld_inc9),na.rm=T)/hhs_inc,
         oth_phi10 = rowSums(cbind(oth_hhld_inc10 , nhpi_hhld_inc10 , aian_hhld_inc10 , tw_hhld_inc10),na.rm=T)/hhs_inc,
         oth_phi11 = rowSums(cbind(oth_hhld_inc11 , nhpi_hhld_inc11 , aian_hhld_inc11 , tw_hhld_inc11),na.rm=T)/hhs_inc,
         oth_phi12 = rowSums(cbind(oth_hhld_inc12 , nhpi_hhld_inc12 , aian_hhld_inc12 , tw_hhld_inc12),na.rm=T)/hhs_inc,
         oth_phi13 = rowSums(cbind(oth_hhld_inc13 , nhpi_hhld_inc13 , aian_hhld_inc13 , tw_hhld_inc13),na.rm=T)/hhs_inc,
         oth_phi14 = rowSums(cbind(oth_hhld_inc14 , nhpi_hhld_inc14 , aian_hhld_inc14 , tw_hhld_inc14),na.rm=T)/hhs_inc,
         oth_phi15 = rowSums(cbind(oth_hhld_inc15 , nhpi_hhld_inc15 , aian_hhld_inc15 , tw_hhld_inc15),na.rm=T)/hhs_inc,
         oth_phi16 = rowSums(cbind(oth_hhld_inc16 , nhpi_hhld_inc16 , aian_hhld_inc16 , tw_hhld_inc16),na.rm=T)/hhs_inc,
         per_aian = ifelse(pop_total > 0, pop_nh_aian/pop_total,0),
         per_asian = ifelse(pop_total > 0, pop_nh_asian/pop_total,0),
         per_black = ifelse(pop_total > 0, pop_nh_black/pop_total,0),
         per_hl = ifelse(pop_total > 0, pop_h/pop_total,0),
         per_other = case_when(pop_total != 0 ~ rowSums(cbind(pop_nh_other,pop_nh_nhpi,pop_nh_multi),na.rm=T)/pop_total, 
                               pop_total == 0 ~ 0),
         per_white = ifelse(pop_total > 0, pop_nh_white/pop_total,0),
         log_asian = case_when(pop_nh_asian != 0 ~ log(1/per_asian),
                               pop_nh_asian == 0 ~ 0),
         log_black = case_when(pop_nh_black != 0 ~ log(1/per_black),
                               pop_nh_black == 0 ~ 0),
         log_hl = case_when(pop_h != 0 ~ log(1/per_hl),
                                pop_h == 0 ~ 0),
         log_white = case_when(pop_nh_white != 0 ~ log(1/per_white),
                               pop_nh_white == 0 ~ 0),
         log_aian = case_when(pop_nh_aian != 0 ~ log(1/per_aian),
                              pop_nh_aian == 0 ~ 0),
         log_other = case_when(rowSums(cbind(pop_nh_other, pop_nh_nhpi, pop_nh_multi),na.rm=T) != 0 ~ log(1/per_other),
                               rowSums(cbind(pop_nh_other, pop_nh_nhpi, pop_nh_multi),na.rm=T) == 0 ~ 0),
         trt_entropy_er = per_asian*log_asian + 
                          per_black*log_black + 
                          per_hl*log_hl + 
                          per_white*log_white +
                          per_aian*log_aian +
                          per_other*log_other,
         trt_entropy_ers = (trt_entropy_er - min(trt_entropy_er))/(max(trt_entropy_er)-min(trt_entropy_er)))

paste("ZIP LEVEL SES ENTROPY")
summary(zip.2022.ses.pr$trt_entropy_er)
paste("ZIP LEVEL SES ENTROPY (SCALED)")
summary(zip.2022.ses.pr$trt_entropy_ers)

##
## now, bring on msa median household income ##
##

load("002_agg_msa_2020.Rda")

agg.msa.mhi.2020 <- agg.msa.2020 %>%
  select(CBSA,
         median_hhld_inc_msa)

## pre merge checks ##

paste("IS ZIP SES FILE UNIQUE BY ZIP-CBSA?")
length(unique(paste(zip.2022.ses.pr$ZIP, 
                    zip.2022.ses.pr$CBSA, sep = "-"))) == nrow(zip.2022.ses.pr)

## merge files ##
zip.2022.ses.msa.m <- stata.merge(zip.2022.ses.pr,
                                  agg.msa.mhi.2020,
                                  "CBSA")

## check merge ##
paste("MERGE BETWEEN ZIP SES AND MSA MHI FILE")
table(zip.2022.ses.msa.m$merge.variable, useNA = "ifany")

## keep matches ##
zip.2022.ses.msa <- zip.2022.ses.msa.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable)

##
## calculate entropy by SES ##
##

## prep data ##
zip.2022.incl <- zip.2022.ses.msa %>%
  mutate(vlil = 0.5*median_hhld_inc_msa,
         lil = 0.8*median_hhld_inc_msa,
         mil = median_hhld_inc_msa,
         hmil = 1.2*median_hhld_inc_msa,
         hil = 1.5*median_hhld_inc_msa) %>%
  rename(GEOID = ZIP)

## keep only necessary vars ##
zip.2022.ses.kvars <- zip.2022.incl %>%
  select(GEOID,
         CBSA,
         starts_with("phi"),
         vlil,
         lil,
         mil,
         hmil,
         hil)

## check for missing variables ##
paste("SUMMARY OF ZIP SES FILE")
summary(zip.2022.ses.kvars)
paste("MISSINGS IN ZIP SES FILE")
sapply(zip.2022.ses.kvars, function(x) sum(is.na(x)))

## input data into function that calculates distribution by HUD income limits ##
pp.zip.2022.ses <- as.data.frame(lapply(zip.2022.ses.kvars[,19:23], hud_group_zip, indata=zip.2022.ses.kvars))

## input resulting data into cleaning function ##
pp.zip.2022.ses.fin <- rd.data.zip(pp.zip.2022.ses)

## test ## 
t.ses.2022 <- pp.zip.2022.ses.fin %>%
  mutate(sum = rowSums(across(where(is.numeric))))

paste("CHECK THAT SES SHARES WERE CREATED CORRECTLY")
summary(t.ses.2022$sum)

## calculate SES entropy ##
zip.2022.ses.entropy <- pp.zip.2022.ses.fin %>%
  mutate_if(is.numeric, funs(ifelse(.>0, .*log(1/.),0))) %>%
  mutate(trt_entropy_ses = rowSums(across(where(is.numeric)))) %>%
  select(GEOID,
         CBSA,
         trt_entropy_ses)

paste("ZIP SES ENTROPY")
summary(zip.2022.ses.entropy$trt_entropy_ses)

## merge SES entropy onto working data ##
zip.2022.ses.entropy.cb.m <- stata.merge(pp.zip.2022.ses.fin,
                                         zip.2022.ses.entropy,
                                         c("GEOID","CBSA"))

## check merge ## 
paste("MERGE BETWEEN ZIP SES FILE AND SES ENTROPY OUTPUT")
table(zip.2022.ses.entropy.cb.m$merge.variable, useNA = "ifany")

## keep matches ##
zip.2022.ses.entropy.fin <- zip.2022.ses.entropy.cb.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable)

##
## determine majority/plurality SES group ##
##

zip.2022.ses.entropy.fin$maj_ses <- c(colnames(zip.2022.ses.entropy.fin[,c("linc",
                                                                           "minc",
                                                                           "hinc")])[apply(zip.2022.ses.entropy.fin[,c("linc",
                                                                                                                       "minc",
                                                                                                                       "hinc")],
                                                                                       1,which.max)])


##
## step 3: now, create joint entropy ##
##

zip.2022.pas.joint <- zip.2022.incl %>%
  select(GEOID,
         CBSA,
         starts_with("as_phi"),
         vlil,
         lil,
         mil,
         hmil,
         hil) %>%
  rename_all(~str_replace(.,"^as_",""))

sapply(zip.2022.pas.joint, function(x) sum(is.na(x)))
zip.2022.pbl.joint <- zip.2022.incl %>%
  select(GEOID,
         CBSA,
         starts_with("bl_phi"),
         vlil,
         lil,
         mil,
         hmil,
         hil) %>%
  rename_all(~str_replace(.,"^bl_",""))

sapply(zip.2022.pbl.joint, function(x) sum(is.na(x)))

zip.2022.phl.joint <- zip.2022.incl %>%
  select(GEOID,
         CBSA,
         starts_with("hl_phi"),
         vlil,
         lil,
         mil,
         hmil,
         hil) %>%
  rename_all(~str_replace(.,"^hl_",""))

sapply(zip.2022.phl.joint, function(x) sum(is.na(x)))

zip.2022.pwh.joint <- zip.2022.incl %>%
  select(GEOID,
         CBSA,
         starts_with("nhwh_phi"),
         vlil,
         lil,
         mil,
         hmil,
         hil) %>%
  rename_all(~str_replace(.,"^nhwh_","")) 

sapply(zip.2022.pwh.joint, function(x) sum(is.na(x)))


zip.2022.poth.joint <- zip.2022.incl %>%
  select(GEOID,
         CBSA,
         starts_with("oth_phi"),
         vlil,
         lil,
         mil,
         hmil,
         hil) %>%
  rename_all(~str_replace(.,"^oth_","")) 

sapply(zip.2022.poth.joint, function(x) sum(is.na(x)))

zip.2022.aspp.joint <- as.data.frame(lapply(zip.2022.pas.joint[,19:23], hud_group_zip, indata=zip.2022.pas.joint))
zip.2022.blpp.joint <- as.data.frame(lapply(zip.2022.pbl.joint[,19:23], hud_group_zip, indata=zip.2022.pbl.joint))
zip.2022.hlpp.joint <- as.data.frame(lapply(zip.2022.phl.joint[,19:23], hud_group_zip, indata=zip.2022.phl.joint))
zip.2022.whpp.joint <- as.data.frame(lapply(zip.2022.pwh.joint[,19:23], hud_group_zip, indata=zip.2022.pwh.joint))
zip.2022.othpp.joint <- as.data.frame(lapply(zip.2022.poth.joint[,19:23], hud_group_zip, indata=zip.2022.poth.joint))

## 
## collect the output
##

zip.2022.joint.ent.dfs <- list(zip.2022.aspp.joint,
                               zip.2022.blpp.joint,
                               zip.2022.hlpp.joint,
                               zip.2022.whpp.joint,
                               zip.2022.othpp.joint)
  
## process ##
zip.2022.joint.ent.dfs.res <- lapply(zip.2022.joint.ent.dfs, rd.data.zip)

zip.2022.as.fin.joint <- zip.2022.joint.ent.dfs.res[[1]] %>%
  rename_with(~paste0(., "_as"), linc:hinc)

zip.2022.bl.fin.joint <- zip.2022.joint.ent.dfs.res[[2]] %>%
  rename_with(~paste0(., "_bl"), linc:hinc)

zip.2022.lx.fin.joint <- zip.2022.joint.ent.dfs.res[[3]] %>%
  rename_with(~paste0(., "_lx"), linc:hinc)

zip.2022.wh.fin.joint <- zip.2022.joint.ent.dfs.res[[4]] %>%
  rename_with(~paste0(., "_wh"), linc:hinc)

zip.2022.oth.fin.joint <- zip.2022.joint.ent.dfs.res[[5]] %>%
  rename_with(~paste0(., "_oth"), linc:hinc)

## combine ##
zip.2022.all.joint.data <- list(zip.2022.as.fin.joint,
                                zip.2022.bl.fin.joint,
                                zip.2022.lx.fin.joint,
                                zip.2022.wh.fin.joint,
                                zip.2022.oth.fin.joint)

zip.2022.all.joint <- zip.2022.all.joint.data %>%
  reduce(full_join, by=c("GEOID","CBSA")) 

## test ## 
t.ses.jt.2022 <- zip.2022.all.joint %>%
  mutate(sum = rowSums(across(where(is.numeric))))

paste("CHECK TO SEE IF JT SHARES WERE CREATED CORRECTLY")
summary(t.ses.jt.2022$sum)

##
## create joint entropy measure ## 
##

zip.2022.all.joint.final <- zip.2022.all.joint %>%
  mutate_if(is.numeric, funs(ifelse(.>0, .*log(1/.),0))) %>%
  mutate(trt_entropy_joint = rowSums(across(where(is.numeric)))) %>%
  select(GEOID,
         CBSA,
         trt_entropy_joint)

paste("JT ENTROPY MEASURE")
summary(zip.2022.all.joint.final$trt_entropy_joint)

##
## step 4: calculate MSA rel vars from tract level data ## 
##

## aggregate tract file to MSA level ##
m.trts.2020.m <- stata.merge(all.tracts.2020.fzip,
                             fips.to.msas,
                             "FIPS")

## diagnose merge ##
paste("MERGE BETWEEN 2020 TRACT FILE AND FIPS TO MSA XWALK")
table(m.trts.2020.m$merge.variable, useNA = "ifany")

## keep only metro tracts ## 
m.trts.2020 <- m.trts.2020.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable) %>%
  mutate(per_asian = case_when(pop_total != 0 ~ pop_nh_asian/pop_total,
                               pop_total == 0 ~ 0),
         per_black = case_when(pop_total != 0 ~ pop_nh_black/pop_total,
                               pop_total == 0 ~ 0),
         per_hl = case_when(pop_total != 0 ~ pop_hl/pop_total,
                                pop_total == 0 ~ 0),
         per_white = case_when(pop_total != 0 ~ pop_nh_white/pop_total,
                               pop_total == 0 ~ 0),
         per_aian = case_when(pop_total != 0 ~ pop_nh_aian/pop_total, 
                              pop_total == 0 ~ 0),
         per_other = case_when(pop_total != 0 ~ rowSums(cbind(pop_nh_other ,pop_nh_nhpi ,pop_nh_multi),na.rm=T)/pop_total, 
                               pop_total == 0 ~ 0))

## now, merge MSA median hhld inc data onto metro tracts ##
mtrts.2020.wmsa.m <- stata.merge(m.trts.2020,
                                 agg.msa.2020,
                                 "CBSA")

## check merge ##
paste("MERGE BETWEEN METRO TRACT FILE AND MSA FILE")
table(mtrts.2020.wmsa.m$merge.variable)

## keep matches, create key vars ##
mtrts.2020.fin <- mtrts.2020.wmsa.m %>%
  filter(merge.variable == 3) %>%
  mutate(D_pre_comp_aian = case_when(per_aian !=0 & per_aian_msa != 0 ~ log(per_aian/per_aian_msa),
                                     per_aian == 0 | per_aian_msa == 0 ~ 0),
         D_pre_comp_asian = case_when(per_asian !=0 &  per_asian_msa != 0 ~ log(per_asian/per_asian_msa),
                                      per_asian == 0 | per_asian_msa == 0 ~ 0),
         D_pre_comp_black = case_when(per_black !=0 & per_black_msa != 0 ~ log(per_black/per_black_msa),
                                      per_black == 0 | per_black_msa == 0 ~ 0),
         D_pre_comp_hl = case_when(per_hl !=0 &  per_hl_msa != 0 ~ log(per_hl/per_hl_msa),
                                       per_hl == 0 | per_hl_msa == 0 ~ 0),
         D_pre_comp_other = case_when(per_other !=0 & per_other_msa != 0 ~ log(per_other/per_other_msa),
                                      per_other == 0 | per_other_msa == 0 ~ 0),
         D_pre_comp_white = case_when(per_white !=0 &  per_white_msa != 0 ~ log(per_white/per_white_msa),
                                      per_white == 0 | per_white_msa == 0 ~ 0),
         D_comp_aian = per_aian * D_pre_comp_aian,
         D_comp_asian = per_asian * D_pre_comp_asian,
         D_comp_black = per_black * D_pre_comp_black,
         D_comp_hl = per_hl * D_pre_comp_hl,
         D_comp_other = per_other * D_pre_comp_other,
         D_comp_white = per_white * D_pre_comp_white,
         D_comp_all = rowSums(cbind(D_comp_aian,D_comp_asian,D_comp_black,D_comp_hl,D_comp_other,D_comp_white),na.rm=T),
         mhi_ratio = median_hhld_inc/median_hhld_inc_msa)

##
## create ER AND SES seg measures ##
##

mtrts.2020.fin$D_int <- ifelse(mtrts.2020.fin$D_comp_all < median(mtrts.2020.fin$D_comp_all,na.rm=T) + 
                               sd(mtrts.2020.fin$D_comp_all,na.rm=T), 1, 0) 

mtrts.2020.fin$mhi_int <- ifelse(mtrts.2020.fin$mhi_ratio >= 0.5 & 
                                 mtrts.2020.fin$mhi_ratio <= 2, 1, 0)

paste("BREAKDOWN OF D_INT")
prop.table(table(mtrts.2020.fin$D_int))
paste("BREAKDOWN OF MHI_INT")
prop.table(table(mtrts.2020.fin$mhi_int))

## now aggregate to zip level ##
msa.vars.zip.m <- stata.merge(mtrts.2020.fin,
                              ziptrt.xwalk.in,
                              "GEOID") 

## check merge ##
table("MERGE BETWEEN TRACT WITH INT VARS AND ZIP TRACT XWALK")
table(msa.vars.zip.m$merge.variable, useNA = "ifany")

## keep matches ##
msa.vars.zip <- msa.vars.zip.m %>%
  filter(merge.variable == 3) %>%
  mutate(D_int_wt = D_int * RES_RATIO,
         mhi_int_wt = mhi_int * RES_RATIO) %>%
  group_by(ZIP) %>%
  summarize(D_int_share = sum(D_int_wt,na.rm=T),
            mhi_int_share = sum(mhi_int_wt,na.rm=T)) %>%
  rename(GEOID = ZIP)

paste("SUMMARY OF D_INT IN RESULTING ZIP LEVEL FILE")
summary(msa.vars.zip$D_int_share)
paste("SUMMARY OF MHI_INT IN RESULTING ZIP LEVEL FILE")
summary(msa.vars.zip$mhi_int_share)

##
## step 4: bring it all together ## 
##

## merge ses and joint entropy ##
zip.all.m1 <- stata.merge(zip.2022.ses.entropy.fin,
                          zip.2022.all.joint.final,
                          c("GEOID","CBSA"))

## check merge ##
paste("MERGE BETWEEN SES ENTROPY AND JT ENTROPY FILES")
table(zip.all.m1$merge.variable, useNA = "ifany")

## keep matches ##
zip.all.m1.use <- zip.all.m1 %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable)

## now, bring on zip level population info ##
## need pop weights to get zip-cbsa data to zip level ##

zip.2022.ses.msa.rd <- zip.2022.ses.msa %>%
  select(ZIP,
         CBSA,
         pop_total) %>%
  rename(GEOID=ZIP) %>%
  select(GEOID,
         CBSA,
         pop_total)

## merge files ##
zip.all.m2 <- stata.merge(zip.all.m1.use,
                          zip.2022.ses.msa.rd,
                          c("GEOID","CBSA"))

## check merge ##
paste("MERGE BETWEEN ZIP-CBSA ENTROPY FILE AND ZIP POP FILE")
table(zip.all.m2$merge.variable, useNA = "ifany")

##
## aggregate zip-cbsa data to zip level ##
##

agg.vars <- c("trt_entropy_ses",
              "trt_entropy_joint",
              "linc",
              "minc",
              "hinc")

zip.all.m2.use <- zip.all.m2 %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable) %>%
  group_by(GEOID) %>%
  mutate(pop_wt = pop_total/sum(pop_total,na.rm=T)) %>%
  ungroup() %>%
  mutate_at(agg.vars, ~ . *pop_wt) %>%
  group_by(GEOID) %>%
  summarize(#median_hhld_inc_wt = sum(median_hhld_inc,na.rm=T),
            trt_entropy_ses_wt = sum(trt_entropy_ses,na.rm=T),
            trt_entropy_joint_wt = sum(trt_entropy_joint,na.rm=T),
            linc_wt = sum(linc,na.rm=T),
            minc_wt = sum(minc,na.rm=T),
            hinc_wt = sum(hinc,na.rm=T))

##
## now, bring on race/ethnicity data ##
##

zip.all.m3 <- stata.merge(zip.all.m2.use,
                          zip.2020.er,
                          "GEOID")

## check merge ##
paste("MERGE BETWEEN ZIP ENTROPY FILE AND ZIP ER DATA")
table(zip.all.m3$merge.variable, useNA = "ifany")

zip.all.m3.use <- zip.all.m3 %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable)

##
## get median hhld inc and hhld pov rt ##
##

zip.oth.ses <- zip.2022.ses %>%
  select(ZIP,
         median_hhld_inc,
         hhld_pov_num,
         hhld_pov_den) %>%
  mutate(hhld_pov_rt = ifelse(hhld_pov_den > 0, hhld_pov_num/hhld_pov_den,0)) %>%
  rename(GEOID = ZIP) %>%
  select(GEOID,
         median_hhld_inc,
         hhld_pov_rt)

## merge files ## 
zip.all.m4 <- stata.merge(zip.all.m3.use,
                          zip.oth.ses,
                          "GEOID")

## check merge ##
paste("MERGE BETWEEN ZIP FILE WITH ER AND ZIP SES FILE")
table(zip.all.m4$merge.variable, useNA = "ifany")

## keep matches ## 
zip.all.m4.use <- zip.all.m4 %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable)

##
## bring on rel race/ethnicity and ses vars ##
##

zip.all.m5 <- stata.merge(zip.all.m4.use,
                          msa.vars.zip,
                          "GEOID")

## check merge ##
paste("MERGE BETWEEN ZIP ER/SES FILE AND ZIP FILE WITH REL MEASURES")
table(zip.all.m5$merge.variable, useNA = "ifany")

## keep matches ##
zip.all.m5.use <- zip.all.m5 %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable)

##
## step 5:  final cleaning ##
##

zip.all.fin <- zip.all.m5.use %>%
  mutate(trt_entropy_ers = (trt_entropy_er - min(trt_entropy_er))/(max(trt_entropy_er)-min(trt_entropy_er)),
         hd_er = ifelse(trt_entropy_ers >= 0.7414 &
                          per_asian < 0.45 & 
                          per_black < 0.45 & 
                          per_hl < 0.45 & 
                          per_white < 0.45 & 
                          per_aian < 0.45 &
                          per_other < 0.45, 1, 0),
         ld_er = ifelse(trt_entropy_ers <= 0.3707 |
                          per_asian > 0.8 |
                          per_black > 0.8 |
                          per_hl > 0.8 |
                          per_white > 0.8 |
                          per_aian > 0.8 |
                          per_other > 0.8, 1, 0),
         md_er = ifelse(hd_er == 0 & 
                        ld_er == 0, 1, 0),
         trt_entropy_sess = (trt_entropy_ses_wt - min(trt_entropy_ses_wt))/(max(trt_entropy_ses_wt)-min(trt_entropy_ses_wt)),
         trt_entropy_joints = (trt_entropy_joint_wt - min(trt_entropy_joint_wt))/(max(trt_entropy_joint_wt)-min(trt_entropy_joint_wt)),
         hd_ses = ifelse(linc_wt < 0.4 & 
                         minc_wt < 0.4 & 
                         hinc_wt < 0.4, 1, 0),
         ld_ses = ifelse(linc_wt > (2/3) | 
                         minc_wt > (2/3) | 
                         hinc_wt > (2/3),1,0),
         md_ses = ifelse((linc_wt >= 0.4 |
                            minc_wt >= 0.4 | 
                            hinc_wt >= 0.4) & 
                           (linc_wt <= (2/3) & 
                            minc_wt <= (2/3) & 
                            hinc_wt <= (2/3)), 1, 0),
         hd_jt = ifelse(hd_er ==1 & hd_ses == 1, 1, 0), 
         ld_jt = ifelse(ld_er==1 | ld_ses == 1, 1, 0), 
         md_jt = ifelse(hd_jt==0 & ld_jt == 0, 1, 0),
         div_er = ifelse(hd_er == 1 | md_er == 1, 1, 0),
         int_er_abs = ifelse(md_er == 1 | hd_er == 1, 1, 0),
         int_er_rel = ifelse((md_er == 1 | hd_er == 1) & D_int_share > 0.75, 1, 0),
         int_er_relf = ifelse((md_er == 1 | hd_er == 1) & D_int_share > 0.75 & 
                                rowSums(cbind(per_black,per_hl), na.rm=T) >=0.2, 1, 0),
         int_ses = ifelse(hd_ses == 1 | md_ses == 1, 1, 0),
         int_ses_rel = ifelse(int_ses == 1 & 
                              mhi_int_share > 0.75 & 
                              hhld_pov_rt < 0.4, 1, 0),
         int_jt_abs = ifelse(int_er_abs == 1 & int_ses == 1, 1, 0),
         int_jt_rel = ifelse(int_er_rel == 1 & int_ses_rel == 1, 1, 0),
         int_jt_relf = ifelse(int_er_relf == 1 & int_ses_rel == 1, 1, 0)) %>%
  rename(ZIP = GEOID)


## diagnostics ##
paste("SUMMARY OF FINAL ZIP FILE")
summary(zip.all.fin)

paste("SUMMARY OF 2020 ER ENTROPY IN FINAL ZIP FILE")
summary(zip.all.fin$trt_entropy_er)
paste("SUMMARY OF 2020 SCALED ER ENTROPY IN FINAL ZIP FILE")
summary(zip.all.fin$trt_entropy_ers)

paste("SUMMARY OF 2020 SES ENTROPY IN FINAL ZIP FILE")
summary(zip.all.fin$trt_entropy_ses_wt)
paste("SUMMARY OF 2020 SCALED SES ENTROPY IN FINAL ZIP FILE")
summary(zip.all.fin$trt_entropy_sess)

paste("SUMMARY OF 2020 JT ENTROPY IN FINAL ZIP FILE")
summary(zip.all.fin$trt_entropy_joint_wt)
paste("SUMMARY OF 2020 SCALED JT ENTROPY IN FINAL ZIP FILE")
summary(zip.all.fin$trt_entropy_joints)

## compare with analytic file (2020) ##

load("002_af_wide_final.Rda")

## overall ethnoraical breakdown ##
paste("BREAKDOWN OF 2020 INT ER ABS, REL, AND RELF IN ZIP FILE")
prop.table(table(zip.all.fin$int_er_abs))
prop.table(table(zip.all.fin$int_er_rel))
prop.table(table(zip.all.fin$int_er_relf))

paste("BREAKDOWN OF 2020 INT ER ABS, REL, AND RELF IN ANALYTIC FILE")
prop.table(table(af.wide.final$int_er_abs_2020))
prop.table(table(af.wide.final$int_er_rel_2020))
prop.table(table(af.wide.final$int_er_relf_2020))

## overall ses breakdown ##
paste("BREAKDOWN OF 2020 SES INT ABS AND REL IN ZIP FILE")
prop.table(table(zip.all.fin$int_ses))
prop.table(table(zip.all.fin$int_ses_rel))

paste("BREAKDOWN OF 2020 SES INT ABS AND REL IN ANALYTIC FILE")
prop.table(table(af.wide.final$int_ses_2020))
prop.table(table(af.wide.final$int_ses_rel_2020))

## overall joint breakdown ##
paste("BREAKDOWN OF 2020 JT INT ABS, REL, AND RELF IN ZIP FILE")
prop.table(table(zip.all.fin$int_jt_abs))
prop.table(table(zip.all.fin$int_jt_rel))
prop.table(table(zip.all.fin$int_jt_relf))

paste("BREAKDOWN OF 2020 JT INT ABS, REL, AND RELF IN ANALYTIC FILE")
prop.table(table(af.wide.final$int_jt_abs_2020))
prop.table(table(af.wide.final$int_jt_rel_a_2020))
prop.table(table(af.wide.final$int_jt_relf_a_2020))

##
## step 6: bring on social capital data ##
##

load("001_sc.Rda")

zip.all.sc.m <- stata.merge(zip.all.fin,
                            sc.in,
                            "ZIP")

## check merge ##
paste("MERGE BETWEEN FINAL ZIP FILE AND SOCIAL CAPITAL DATA")
table(zip.all.sc.m$merge.variable, useNA = "ifany")
prop.table(table(zip.all.sc.m$merge.variable, useNA = "ifany"))

## keep matches ##
zip.all.sc <- zip.all.sc.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable)

##
## calculate majority/plurality ethnoracial group ##
##

zip.all.sc$maj_race <- c(colnames(zip.all.sc[,c("per_asian",
                                                "per_black",
                                                "per_hl",
                                                "per_white",
                                                "per_aian",
                                                "per_other")])[apply(zip.all.sc[,c("per_asian",
                                                                                   "per_black",
                                                                                   "per_hl",
                                                                                   "per_white",
                                                                                   "per_aian",
                                                                                   "per_other")],
                                                                                   1,which.max)])


##
## calculate majority/plurality income group ##
##

zip.all.sc$maj_ses <- c(colnames(zip.all.sc[,c("linc_wt",
                                               "minc_wt",
                                               "hinc_wt")])[apply(zip.all.sc[,c("linc_wt",
                                                                                "minc_wt",
                                                                                "hinc_wt")],
                                                                                1,which.max)])

## combine the ER and SES groups into one variable ##
zip.all.sc$maj_race_ses <- paste(zip.all.sc$maj_race,
                                 zip.all.sc$maj_ses,
                                 sep = " ")

## diagnostics ## 
paste("BREAKDOWN OF 2020 ER INT ABS, REL, AND RELF IN ANALYTIC FILE")
prop.table(table(af.wide.final$int_er_abs_2020, useNA = "ifany"))
prop.table(table(af.wide.final$int_er_rel_2020, useNA = "ifany"))
prop.table(table(af.wide.final$int_er_relf_2020, useNA = "ifany"))

paste("BREAKDOWN OF 2020 ER INT ABS, REL, AND RELF IN ZIP SC FILE")
prop.table(table(zip.all.sc$int_er_abs, useNA = "ifany"))
prop.table(table(zip.all.sc$int_er_rel, useNA = "ifany"))
prop.table(table(zip.all.sc$int_er_relf, useNA = "ifany"))

paste("BREAKDOWN OF 2020 SES INT ABS AND REL IN ANALYTIC FILE")
prop.table(table(af.wide.final$int_ses_2020, useNA = "ifany"))
prop.table(table(af.wide.final$int_ses_rel_a_2020, useNA = "ifany"))

paste("BREAKDOWN OF 2020 SES INT ABS AND REL IN ANALYTIC FILE")
prop.table(table(zip.all.sc$int_ses, useNA = "ifany"))
prop.table(table(zip.all.sc$int_ses_rel, useNA = "ifany"))

paste("BREAKDOWN OF 2020 JT INT RELF IN ANALYTIC FILE")
prop.table(table(af.wide.final$int_jt_relf_a_2020))
paste("BREAKDOWN OF 2020 JT INT RELF IN ZIP SC FILE")
prop.table(table(zip.all.sc$int_jt_relf, useNA = "ifany"))

## save file ## 
#save(zip.all.sc,
#     file = "009_zip_all_sc.Rda")

##
## stats ##
##

paste("MEAN LEVEL OF ECONOMIC CONNECTEDNESS ACROSS SAMPLE")
summary(zip.all.sc$nbhd_ec_zip)
paste("MEAN LEVEL OF EXPOSURE ACROSS SAMPLE")
summary(zip.all.sc$exposure_grp_mem_zip)
paste("MEAN LEVEL OF FRIENDING BIAS ACROSS SAMPLE")
summary(zip.all.sc$bias_grp_mem_zip)

paste("SUMMARY OF EXPOSURE IN SES SEG NEIGHBORHOODS")
summary(zip.all.sc$exposure_grp_mem_zip[zip.all.sc$int_ses_rel == 0])
paste("SUMMARY OF EXPOSURE IN SES INT NEIGHBORHOODS")
summary(zip.all.sc$exposure_grp_mem_zip[zip.all.sc$int_ses_rel == 1])

paste("SUMMARY OF FRIENDING BIAS IN SES SEG NEIGHBORHOODS")
summary(zip.all.sc$bias_grp_mem_zip[zip.all.sc$int_ses == 0])
paste("SUMMARY OF FRIENDING BIAS IN SES SEG REL NEIGHBORHOODS")
summary(zip.all.sc$bias_grp_mem_zip[zip.all.sc$int_ses_rel == 0])
paste("SUMMARY OF FRIENDING BIAS IN SES INT NEIGHBORHOODS")
summary(zip.all.sc$bias_grp_mem_zip[zip.all.sc$int_ses == 1])
paste("SUMMARY OF FRIENDING BIAS IN SES INT REL NEIGHBORHOODS")
summary(zip.all.sc$bias_grp_mem_zip[zip.all.sc$int_ses_rel == 1])

##
## density plots
##

## exposure ##
int.ses.rel1 <- zip.all.sc %>%
  filter(int_ses_rel == 1 & 
           !is.na(exposure_grp_mem_zip)) %>%
  mutate(gr = "int") %>%
  select(gr,
         exposure_grp_mem_zip)

seg.ses.rel1 <- zip.all.sc %>%
  filter(int_ses_rel == 0 & 
           !is.na(exposure_grp_mem_zip)) %>%
  mutate(gr = "seg") %>%
  select(gr,
         exposure_grp_mem_zip)

ses.rel1 <- rbind(int.ses.rel1,
                  seg.ses.rel1)

ggplot(ses.rel1, aes(x=exposure_grp_mem_zip, fill=gr)) + 
  geom_density(alpha=0.25)

## friending bias ##

int.ses.rel2 <- zip.all.sc %>%
  filter(int_ses_rel == 1 & 
           !is.na(bias_grp_mem_zip)) %>%
  mutate(gr = "int") %>%
  select(gr,
         bias_grp_mem_zip)

seg.ses.rel2 <- zip.all.sc %>%
  filter(int_ses_rel == 0 & 
           !is.na(bias_grp_mem_zip)) %>%
  mutate(gr = "seg") %>%
  select(gr,
         bias_grp_mem_zip)

ses.rel2 <- rbind(int.ses.rel2,
                  seg.ses.rel2)

ggplot(ses.rel2, aes(x=bias_grp_mem_zip, fill=gr)) + 
  geom_density(alpha=0.25) 

##
## bar plots ##
##

## economic connectedness ##

ec.pl <- data.frame(gr = c("seg",
                           "int",
                            "seg",
                            "int",
                            "seg",
                            "int"),
                    cat = c("Race/ethnicity",
                            "Race/ethnicity",
                            "SES",
                            "SES",
                            "Joint",
                            "Joint"),
                    val = c(mean(zip.all.sc$nbhd_ec_zip[zip.all.sc$int_er_relf==0], na.rm=T),
                            mean(zip.all.sc$nbhd_ec_zip[zip.all.sc$int_er_relf==1], na.rm=T),
                            mean(zip.all.sc$nbhd_ec_zip[zip.all.sc$int_ses_rel==0], na.rm=T),
                            mean(zip.all.sc$nbhd_ec_zip[zip.all.sc$int_ses_rel==1], na.rm=T),
                            mean(zip.all.sc$nbhd_ec_zip[zip.all.sc$int_jt_relf==0], na.rm=T),
                            mean(zip.all.sc$nbhd_ec_zip[zip.all.sc$int_jt_relf==1], na.rm=T)))

ec.pl$cat_f <- factor(ec.pl$cat, 
                      levels = c("Race/ethnicity",
                                 "SES",
                                 "Joint"))

## figure 6 ## 
ggplot(ec.pl, aes(fill= gr, y=val, x=cat_f)) + 
  geom_bar(position="dodge", stat="identity") + 
  scale_fill_manual(name = "Joint integration",
                    breaks = c("seg","int"), 
                    labels = c("Segregated","Integrated"),
                    values=c("purple4", "gold")) +
  xlab("") + 
  ylab("") + 
  theme_minimal()

## exposure ##

ex.pl <- data.frame(gr = c("seg",
                           "int",
                           "seg",
                           "int",
                           "seg",
                           "int"),
                    cat = c("Race/ethnicity",
                            "Race/ethnicity",
                            "SES",
                            "SES",
                            "Joint",
                            "Joint"),
                    val = c(mean(zip.all.sc$exposure_grp_mem_zip[zip.all.sc$int_er_relf==0], na.rm=T),
                            mean(zip.all.sc$exposure_grp_mem_zip[zip.all.sc$int_er_relf==1], na.rm=T),
                            mean(zip.all.sc$exposure_grp_mem_zip[zip.all.sc$int_ses_rel==0], na.rm=T),
                            mean(zip.all.sc$exposure_grp_mem_zip[zip.all.sc$int_ses_rel==1], na.rm=T),
                            mean(zip.all.sc$exposure_grp_mem_zip[zip.all.sc$int_jt_relf==0], na.rm=T),
                            mean(zip.all.sc$exposure_grp_mem_zip[zip.all.sc$int_jt_relf==1], na.rm=T)))

ex.pl$cat_f <- factor(ex.pl$cat, 
                      levels = c("Race/ethnicity",
                                 "SES",
                                 "Joint"))

## figure 7a ##
ggplot(ex.pl, aes(fill= gr, y=val, x=cat_f)) + 
  geom_bar(position="dodge", stat="identity") + 
  scale_fill_manual(name = "Joint integration",
                    breaks = c("seg","int"), 
                    labels = c("Segregated","Integrated"),
                    values=c("purple4", "gold")) +
  xlab("") + 
  ylab("") + 
  theme_minimal()


## friending bias ##

fb.pl <- data.frame(gr = c("seg",
                           "int",
                           "seg",
                           "int",
                           "seg",
                           "int"),
                    cat = c("Race/ethnicity",
                            "Race/ethnicity",
                            "SES",
                            "SES",
                            "Joint",
                            "Joint"),
                    val = c(mean(zip.all.sc$bias_grp_mem_zip[zip.all.sc$int_er_relf==0], na.rm=T),
                            mean(zip.all.sc$bias_grp_mem_zip[zip.all.sc$int_er_relf==1], na.rm=T),
                            mean(zip.all.sc$bias_grp_mem_zip[zip.all.sc$int_ses_rel==0], na.rm=T),
                            mean(zip.all.sc$bias_grp_mem_zip[zip.all.sc$int_ses_rel==1], na.rm=T),
                            mean(zip.all.sc$bias_grp_mem_zip[zip.all.sc$int_jt_relf==0], na.rm=T),
                            mean(zip.all.sc$bias_grp_mem_zip[zip.all.sc$int_jt_relf==1], na.rm=T)))

fb.pl$cat_f <- factor(fb.pl$cat, 
                      levels = c("Race/ethnicity",
                                 "SES",
                                 "Joint"))

## figure 7b ##
ggplot(fb.pl, aes(fill= gr, y=val, x=cat_f)) + 
  geom_bar(position="dodge", stat="identity") + 
  scale_fill_manual(name = "Joint integration",
                    breaks = c("seg","int"), 
                    labels = c("Segregated","Integrated"),
                    values=c("purple4", "gold")) +
  xlab("") + 
  ylab("") + 
  theme_minimal()

##
## what are the characteristics of integrated neighborhood with high levels of social capital? ##
##

## profiles ## 

## integrated and above-average social capital ##

int.jt.conn.a <- zip.all.sc %>%
  filter(int_jt_relf == 1 & 
         nbhd_ec_zip > median(nbhd_ec_zip, na.rm=T))

prop.table(table(int.jt.conn.a$maj_race, useNA = "ifany"))
prop.table(table(int.jt.conn.a$maj_ses, useNA = "ifany"))
prop.table(table(int.jt.conn.a$maj_race_ses, useNA = "ifany"))

summary(int.jt.conn.a$median_hhld_inc)
summary(int.jt.conn.a$hhld_pov_rt)
summary(int.jt.conn.a$exposure_grp_mem_zip)
summary(int.jt.conn.a$bias_grp_mem_zip)

## segregated and above-average social capital ##

seg.jt.conn.a <- zip.all.sc %>%
  filter(int_jt_relf == 0 & 
         nbhd_ec_zip > median(nbhd_ec_zip, na.rm=T))

prop.table(table(seg.jt.conn.a$maj_race, useNA = "ifany"))
prop.table(table(seg.jt.conn.a$maj_ses, useNA = "ifany"))
prop.table(table(seg.jt.conn.a$maj_race_ses, useNA = "ifany"))

summary(seg.jt.conn.a$median_hhld_inc)
summary(seg.jt.conn.a$hhld_pov_rt)
summary(seg.jt.conn.a$exposure_grp_mem_zip)
summary(seg.jt.conn.a$bias_grp_mem_zip)

## integrated and average/below social capital ##

int.jt.conn.b <- zip.all.sc %>%
  filter(int_jt_relf == 1 & 
         nbhd_ec_zip <= median(nbhd_ec_zip, na.rm=T))

prop.table(table(int.jt.conn.b$maj_race, useNA = "ifany"))
prop.table(table(int.jt.conn.b$maj_ses, useNA = "ifany"))
prop.table(table(int.jt.conn.b$maj_race_ses, useNA = "ifany"))

summary(int.jt.conn.b$median_hhld_inc)
summary(int.jt.conn.b$hhld_pov_rt)
summary(int.jt.conn.b$exposure_grp_mem_zip)
summary(int.jt.conn.b$bias_grp_mem_zip)

## segregated and average/below social capital ##

seg.jt.conn.b <- zip.all.sc %>%
  filter(int_jt_relf == 0 & 
         nbhd_ec_zip <= median(nbhd_ec_zip, na.rm=T))

prop.table(table(seg.jt.conn.b$maj_race, useNA = "ifany"))
prop.table(table(seg.jt.conn.b$maj_ses, useNA = "ifany"))
prop.table(table(seg.jt.conn.b$maj_race_ses, useNA = "ifany"))

summary(seg.jt.conn.b$median_hhld_inc)
summary(seg.jt.conn.b$hhld_pov_rt)
summary(seg.jt.conn.b$exposure_grp_mem_zip)
summary(seg.jt.conn.b$bias_grp_mem_zip)


### END OF PROGRAM ###

sink()
sink(type = "message")